***** Further Clean: claim data


clear 
clear matrix
set memory 1000m
set more off
cap log close


cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results"




use working_data/claim_clean_070219.dta, clear




replace jiaoq=0 if jiaoq==. 
replace no_reimburse=0 if no_reimburse==.
replace cancelled=0 if cancelled==.

* 6 combinations (no reimburse and cancelled cannot be 1 at the same time)

* jiaoq
gen data_type=1 if jiaoq==1 & no_reimburse==0 & cancelled==0
replace data_type=2 if jiaoq==1 & no_reimburse==1 & cancelled==0
replace data_type=3 if jiaoq==1 & no_reimburse==0 & cancelled==1
 
* sy
replace data_type=4 if jiaoq==0 & no_reimburse==0 & cancelled==0
replace data_type=5 if jiaoq==0 & no_reimburse==1 & cancelled==0
replace data_type=6 if jiaoq==0 & no_reimburse==0 & cancelled==1
 
tab data_type, m



** fill in values using different records on the same claim

gsort vin claim_date -ins_company
bysort vin claim_date: replace ins_company=ins_company[_n-1] if ins_company=="" & ins_company[_n-1]~=""

gsort vin claim_date -accident_loc
bysort vin claim_date: replace accident_loc=accident_loc[_n-1] if accident_loc=="" & accident_loc[_n-1]~=""


gsort vin claim_date -repair_details
bysort vin claim_date: replace repair_details=repair_details[_n-1] if repair_details=="" & repair_details[_n-1]~=""


sort vin claim_date contact_police
bysort vin claim_date: replace contact_police=contact_police[_n-1] if contact_police==. & contact_police[_n-1]~=.


sort vin claim_date contact_med
bysort vin claim_date: replace contact_med=contact_med[_n-1] if contact_med==. & contact_med[_n-1]~=.


sort vin claim_date involve_others
bysort vin claim_date: replace involve_others=involve_others[_n-1] if involve_others==. & involve_others[_n-1]~=.


sort vin claim_date at_fault
bysort vin claim_date: replace at_fault=at_fault[_n-1] if at_fault==. & at_fault[_n-1]~=.


sort vin claim_date human_injury
bysort vin claim_date: replace human_injury=human_injury[_n-1] if human_injury==. & human_injury[_n-1]~=.

sort vin claim_date other_damage
bysort vin claim_date: replace other_damage=other_damage[_n-1] if other_damage==. & other_damage[_n-1]~=.

sort vin claim_date own_damage
bysort vin claim_date: replace own_damage=own_damage[_n-1] if own_damage==. & own_damage[_n-1]~=.

sort vin claim_date opp_damage
bysort vin claim_date: replace opp_damage=opp_damage[_n-1] if opp_damage==. & opp_damage[_n-1]~=.


sort vin claim_date num_parts_repaired
bysort vin claim_date: replace num_parts_repaired=num_parts_repaired[_n-1] if num_parts_repaired==. & num_parts_repaired[_n-1]~=.



sort vin claim_date data_type

bysort vin claim_date data_type: gen temp=_N
tab temp, m

list claim_details if temp==2


sort vin claim_date total_fee
gen temp_length=length(claim_details)
bysort vin claim_date data_type: drop if _n==1 & temp==2 & temp_length<=temp_length[_n+1]

drop temp temp_length

bysort vin claim_date data_type: gen temp=_N
tab temp, m

list claim_details if temp==2

gen temp_length=length(claim_details)
bysort vin claim_date data_type: egen min_length=min(temp_length) if temp==2

drop if min_length==temp_length & temp==2


list claim_details if temp>2
drop if total_fee==0 & temp==4
drop if (claim_details=="碰撞 / 在太阳山路188弄小区内因碰撞发生事故。(交强)" | claim_details=="GPIC / (交强)") & temp==3

drop temp temp_length min_length

bysort vin claim_date data_type: gen temp=_N
tab temp, m   // now each data only appear once for a claim
drop temp




** now generate dataset: each claim in one row


forvalues i=1(1)6{
gen total_fee_`i'=total_fee if data_type==`i'
gen labor_fee_`i'=labor_fee if data_type==`i'
gen material_fee_`i'=material_fee if data_type==`i'

gen cat_`i'=1 if data_type==`i'
bysort vin claim_date: egen total_cat_`i'=total(cat_`i')
drop cat_`i'
}


rename total_cat_1 jiaoq_new
rename total_cat_2 no_reimb_jiaoq
rename total_cat_3 cancelled_jiaoq
rename total_cat_4 sy_new
rename total_cat_5 no_reimb_sy
rename total_cat_6 cancelled_sy





* (1): deal with total fee

forvalues i=1(1)6{
bysort vin claim_date: egen temp`i'=total(total_fee_`i')
}

gen total_fee_jiaoq=temp1
replace total_fee_jiaoq=temp2 if total_fee_jiaoq==0 & temp2~=0
replace total_fee_jiaoq=temp3 if total_fee_jiaoq==0 & temp3~=0


gen total_fee_sy=temp4
replace total_fee_sy=temp5 if total_fee_sy==0 & temp5~=0
replace total_fee_sy=temp6 if total_fee_sy==0 & temp6~=0

drop temp1-temp6 


* (2): deal with labor fee

forvalues i=1(1)6{
bysort vin claim_date: egen temp`i'=total(labor_fee_`i')
}

gen labor_fee_jiaoq=temp1
replace labor_fee_jiaoq=temp2 if labor_fee_jiaoq==0 & temp2~=0
replace labor_fee_jiaoq=temp3 if labor_fee_jiaoq==0 & temp3~=0


gen labor_fee_sy=temp4
replace labor_fee_sy=temp5 if labor_fee_sy==0 & temp5~=0
replace labor_fee_sy=temp6 if labor_fee_sy==0 & temp6~=0

drop temp1-temp6 


* (2): deal with material fee

forvalues i=1(1)6{
bysort vin claim_date: egen temp`i'=total(material_fee_`i')
}

gen material_fee_jiaoq=temp1
replace material_fee_jiaoq=temp2 if material_fee_jiaoq==0 & temp2~=0
replace material_fee_jiaoq=temp3 if material_fee_jiaoq==0 & temp3~=0


gen material_fee_sy=temp4
replace material_fee_sy=temp5 if material_fee_sy==0 & temp5~=0
replace material_fee_sy=temp6 if material_fee_sy==0 & temp6~=0

drop temp1-temp6 


forvalues i=1(1)6{
drop total_fee_`i' labor_fee_`i' material_fee_`i'
}


bysort vin claim_date: keep if _n==1

drop total_fee labor_fee material_fee claim_details repair_details first second second_orig jiaoq no_reimburse cancelled accident_type


rename jiaoq_new jiaoq
rename sy_new sy
rename accident_type_new accident_type

gen calendar_day=claim_date

* make sure each day has only one claim
bysort vin calendar_day: gen temp=_N
tab temp, m
drop temp

count if claim_date==""

save working_data/claim_clean_further_07032019.dta, replace 










